{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# 6.2 Binary Data Formats (二进制数据格式)\n",
    "\n",
    "最简单的以二进制的格式来存储数据的方法（也被叫做serialization，序列化），就是用python内建的pickle。所有的pandas object都有一个to_pickle方法，可以用来存储数据为pickle格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frame = pd.read_csv('../examples/ex1.csv')\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "frame.to_pickle('../examples/frame_pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array_archive.npz          \u001b[31mmacrodata.csv\u001b[m\u001b[m\r\n",
      "array_compressed.npz       mydata.csv\r\n",
      "\u001b[31marray_ex.txt\u001b[m\u001b[m               \u001b[31mout.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mcsv_mindex.csv\u001b[m\u001b[m             \u001b[31msegismundo.txt\u001b[m\u001b[m\r\n",
      "\u001b[31mex1.csv\u001b[m\u001b[m                    sink.txt\r\n",
      "\u001b[31mex1.xlsx\u001b[m\u001b[m                   some_array.npy\r\n",
      "\u001b[31mex2.csv\u001b[m\u001b[m                    \u001b[31mspx.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mex3.csv\u001b[m\u001b[m                    \u001b[31mstinkbug.png\u001b[m\u001b[m\r\n",
      "\u001b[31mex3.txt\u001b[m\u001b[m                    \u001b[31mstock_px.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mex4.csv\u001b[m\u001b[m                    \u001b[31mstock_px_2.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mex5.csv\u001b[m\u001b[m                    \u001b[31mtest_file.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mex6.csv\u001b[m\u001b[m                    \u001b[31mtips.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mex7.csv\u001b[m\u001b[m                    tmp.txt\r\n",
      "\u001b[31mexample.json\u001b[m\u001b[m               \u001b[31mtseries.csv\u001b[m\u001b[m\r\n",
      "\u001b[31mfdic_failed_bank_list.html\u001b[m\u001b[m \u001b[31mvolume.csv\u001b[m\u001b[m\r\n",
      "frame_pickle               \u001b[31myahoo_price.pkl\u001b[m\u001b[m\r\n",
      "\u001b[31mipython_bug.py\u001b[m\u001b[m             \u001b[31myahoo_volume.pkl\u001b[m\u001b[m\r\n"
     ]
    }
   ],
   "source": [
    "!ls ../examples/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用内建的pickle可以直接读取任何pickle文件，或者直接用pandas.read_pickle:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_pickle('../examples/frame_pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意：pickle只推荐用于短期存储。因为这种格式无法保证长期稳定；比如今天pickled的一个文件，可能在库文件更新后无法读取。\n",
    "\n",
    "python还支持另外两种二进制数据格式：HDF5和MessagePack。下面会介绍一个HDF5，但是我们鼓励你多尝试一个不同的文件格式，看看他们能有多快，是否符合你数据分析的要求。另外一些可用的存储格式有：bcolz 和 Feather。\n",
    "\n",
    "# 1 Using HDF5 Format\n",
    "\n",
    "HDF5格式是用来存储大量的科学数组数据的。这种格式还能用于其他一些语言。其中HDF表示hierarchical data format。每一个HDF5格式能春初多个数据集，并支持metadata。\n",
    "\n",
    "> 元数据(meta data)——“data about data” 关于数据的数据，一般是结构化数据（如存储在数据库里的数据，规定了字段的长度、类型等）。元数据是指从信息资源中抽取出来的用于说明其特征、内容的结构化的数据(如题名,版本、出版数据、相关说明,包括检索点等)，用于组织、描述、检索、保存、管理信息和知识资源。\n",
    "\n",
    "HDF5 支持多种压缩模式的on-the-fly compression（即时压缩），能让数据中一些重复的部分存储地更有效。HDF5对于处理大数据集是一个很好的选择，因为他不会把所有数据一次性读取到内存里，我们可以从很大的数组中有效率地读取一小部分。\n",
    "\n",
    "能用PyTables或h5py来访问HDF5数据，pandas也有提供一个high-level的交互界面。HDFStore类像dict一样能用来处理low-level细节：\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame = pd.DataFrame({'a': np.random.randn(100)})\n",
    "\n",
    "store = pd.HDFStore('../examples/mydata.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.908968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.589368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.015493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.029850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.682180</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          a\n",
       "0  0.908968\n",
       "1 -0.589368\n",
       "2  0.015493\n",
       "3  0.029850\n",
       "4  1.682180"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frame.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "one;two;three\r\n",
      "1;2;3\r\n",
      "4;5;6\r\n",
      "7;8;9\r\n"
     ]
    }
   ],
   "source": [
    "cat ../examples/mydata.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<class 'pandas.io.pytables.HDFStore'>\n",
       "File path: ../examples/mydata.h5\n",
       "/obj1                frame        (shape->[100,1])\n",
       "/obj1_col            series       (shape->[100])  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "store['obj1'] = frame\n",
    "\n",
    "store['obj1_col'] = frame['a']\n",
    "\n",
    "store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HDF5中的object能用像dict一样的API来提取："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.908968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.589368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.015493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.029850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.682180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.977549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.806008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-0.458852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-0.833007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.410430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>-0.232093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.002657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.642136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-1.772356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.411567</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1.259846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>-0.730757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>-1.401318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2.984850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>-0.140144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>-0.972235</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>-0.592930</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1.538674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>-0.052034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>0.260698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>-0.859609</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>1.119075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>-0.898982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>0.191869</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1.013184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>0.507719</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>-0.146967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>0.694807</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>-0.482966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>1.816455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>0.535113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>-1.006997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>2.088429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>-0.449548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>1.133784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>0.413018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>0.324538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>1.434230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>0.767194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>2.073305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>-2.643750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>2.330771</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>-1.683786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>1.631209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>0.036383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>0.755447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>-0.712269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>-0.914963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>-0.660928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>1.465104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>0.101968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>0.669502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>-0.210062</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>-0.453599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>0.848008</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           a\n",
       "0   0.908968\n",
       "1  -0.589368\n",
       "2   0.015493\n",
       "3   0.029850\n",
       "4   1.682180\n",
       "5  -0.977549\n",
       "6   0.806008\n",
       "7  -0.458852\n",
       "8  -0.833007\n",
       "9   0.410430\n",
       "10 -0.232093\n",
       "11  0.002657\n",
       "12 -0.642136\n",
       "13 -1.772356\n",
       "14  0.411567\n",
       "15  1.259846\n",
       "16 -0.730757\n",
       "17 -1.401318\n",
       "18  2.984850\n",
       "19 -0.140144\n",
       "20 -0.972235\n",
       "21 -0.592930\n",
       "22  1.538674\n",
       "23 -0.052034\n",
       "24  0.260698\n",
       "25 -0.859609\n",
       "26  1.119075\n",
       "27 -0.898982\n",
       "28  0.191869\n",
       "29  1.013184\n",
       "..       ...\n",
       "70  0.507719\n",
       "71 -0.146967\n",
       "72  0.694807\n",
       "73 -0.482966\n",
       "74  1.816455\n",
       "75  0.535113\n",
       "76 -1.006997\n",
       "77  2.088429\n",
       "78 -0.449548\n",
       "79  1.133784\n",
       "80  0.413018\n",
       "81  0.324538\n",
       "82  1.434230\n",
       "83  0.767194\n",
       "84  2.073305\n",
       "85 -2.643750\n",
       "86  2.330771\n",
       "87 -1.683786\n",
       "88  1.631209\n",
       "89  0.036383\n",
       "90  0.755447\n",
       "91 -0.712269\n",
       "92 -0.914963\n",
       "93 -0.660928\n",
       "94  1.465104\n",
       "95  0.101968\n",
       "96  0.669502\n",
       "97 -0.210062\n",
       "98 -0.453599\n",
       "99  0.848008\n",
       "\n",
       "[100 rows x 1 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "store['obj1']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HDFStore支持两种存储架构，fixed和table。后者通常更慢一些，但支持查询操作："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "store.put('obj2', frame, format='table')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>-1.186667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>0.628998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.668921</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>0.775731</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>-1.329899</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1.920832</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           a\n",
       "10 -1.186667\n",
       "11  0.628998\n",
       "12 -0.668921\n",
       "13  0.775731\n",
       "14 -1.329899\n",
       "15  1.920832"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "store.select('obj2', where=['index >= 10 and index <= 15'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "put是存储的另一种写法，类似于之前的`store['obj2'] = frame`，但这种协防能让我们设置存储格式。\n",
    "\n",
    "pandas.read_hdf函数也很方便："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "frame.to_hdf('../examples/mydata.h5', 'obj3', format='table')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.324210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.770813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.554799</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.741867</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.875071</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          a\n",
       "0  1.324210\n",
       "1  0.770813\n",
       "2  0.554799\n",
       "3 -1.741867\n",
       "4 -1.875071"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_hdf('../examples/mydata.h5', 'obj3', where=['index < 5'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意：如果我们是把数据存在远端服务器上，比如Amazon S3或HDFS，使用一些为分布式存储实际的二进制格式会更适合一些，比如Apache Parquet。\n",
    "\n",
    "如果是在本地处理很大数据量的话，推荐尝试PyTables和h5py看是否符合你的要求。因为很多数据分析问题都受限于I/O，所以用HDF5这样的工具能加快应用。\n",
    "\n",
    "注意：HDF5不是数据库（database）。它最适合一次写入，多次读取的数据库。尽管数据可以在任何时间多次写入一个文件，如果多个使用者同时写入的话，文件会被破坏。\n",
    "\n",
    "# 2 Reading Microsoft Excel Files（读取微软的excel文件）\n",
    "\n",
    "pandas支持读取表格型数据（excel 2003或更高）文件，使用ExcelFile class或pandas.read_excel函数。这些工具需要一些富家的包xlrd和openpyxl来分别读取XLS和XLSX文件。你可以通过pip或conda来安装。\n",
    "\n",
    "使用ExcelFile，创建一个instance，通过给xls或xlsx一个路径："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "xlsx = pd.ExcelFile('../examples/ex1.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "保存在sheet里的数据，可以通过parse来读取为DataFrame："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_excel(xlsx, 'Sheet1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果要读取一个文件中的多个sheet，用ExcelFile会更快。但让，你也能把文件名直接传递给pandas.read_excel:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frame = pd.read_excel('../examples/ex1.xlsx', 'Sheet1')\n",
    "frame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果要把pandas数据写为Excel格式，你必须先创建一个ExcelWrite，然后用to_excel方法："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "writer = pd.ExcelWriter('../examples/ex2.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame.to_excel(writer, 'Sheet1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果不适用ExcelWriter的话，可以直接传给to_excel一个path："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "frame.to_excel('../examples/ex2.xlsx')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [py35]",
   "language": "python",
   "name": "Python [py35]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
